import glob
import json
import csv
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.cluster import KMeans
# Handling the multiple languages used in the different papers on the same domain/area of study
from tqdm import tqdm
from langdetect import detect
from langdetect import DetectorFactory
plt.style.use('ggplot')
# figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
# Paths to the CORD-19 dataset on the local machine.
root_path = 'C:/Users/Dell/Desktop/Covid-19/CORD-19-research-challenge'
# BUG FIX: the original built this with a backslash ('\metadata.csv'),
# mixing path separators inside an f-string; use '/' consistently.
metadata_path = f'{root_path}/metadata.csv'
# Read the metadata index; force the id columns to str so identifiers
# (which may carry leading zeros) are not mangled into numbers.
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str
})
# Collect every full-text JSON file anywhere under the dataset root.
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)
len(all_json)
# Lower-case helper to convert the references and title to lower case for simplicity
def lower_case(input_str):
    """Return *input_str* converted to lower case (normalises titles/references)."""
    return input_str.lower()
def get_breaks(content, length):
    """Re-flow *content* with '<br>' markers for compact plot labels.

    Words are space-joined with a leading space; whenever the running
    character count (whitespace excluded) exceeds *length*, a '<br>' is
    placed before the next word and the counter restarts.
    """
    pieces = []
    running = 0
    for word in content.split(' '):
        running += len(word)
        if running > length:
            # Break the line before this word and restart the counter.
            pieces.append('<br>' + word)
            running = 0
        else:
            pieces.append(' ' + word)
    return ''.join(pieces)
class FileReader:
    """Parse one CORD-19 full-text JSON file.

    Exposes:
        paper_id   -- stripped paper id
        title      -- stripped paper title
        abstract   -- newline-joined abstract paragraphs
        body_text  -- newline-joined body paragraphs
        references -- newline-joined, lower-cased cited titles
    """

    def __init__(self, file_path):
        # Dataset JSON is UTF-8; be explicit so the platform default
        # encoding (e.g. cp1252 on Windows) cannot break parsing.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)
        self.paper_id = content['paper_id'].strip()
        self.title = content['metadata']['title'].strip()
        self.abstract = []
        self.body_text = []
        self.references = []
        # Abstract paragraphs
        for entry in content['abstract']:
            self.abstract.append(entry['text'])
        # Body-text paragraphs
        for entry in content['body_text']:
            self.body_text.append(entry['text'])
        # Cited titles.  FIX: the original did `for key, value in
        # enumerate(content['bib_entries'])`, where `value` was really the
        # dict key and `key` an unused index -- iterate the dict directly.
        for bib_key in content['bib_entries']:
            self.references.append(content['bib_entries'][bib_key]['title'].strip().lower())
        self.abstract = '\n'.join(self.abstract)
        self.body_text = '\n'.join(self.body_text)
        self.references = '\n'.join(self.references)

    def __repr__(self):
        return f'{self.paper_id}: {self.title[:200]}...'
# Inspect the first parsed paper.
first_row = FileReader(all_json[0])
print(first_row.references)
# Build a dataframe of the complete dataset.
dict_ = {
    'paper_id': [],
    'abstract': [],
    'body_text': [],
    'authors': [],
    'references': [],
    'title': [],
    'journal': [],
    'abstract_summary': []
}
# Progress step.  FIX: guard against small corpora, where
# len(all_json) // 10 == 0 would make the modulo below divide by zero.
progress_step = max(len(all_json) // 10, 1)
for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'Processing index : {idx} of {len(all_json)}')
    content = FileReader(entry)
    # Metadata row for this paper.  FIX: the original performed this exact
    # lookup twice (once here, once again before the authors block); a
    # single lookup is reused throughout.
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # No metadata -> skip the paper.
    if len(meta_data) == 0:
        continue
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    dict_['references'].append(content.references)
    # Abstract summary column, used later when plotting.
    if len(content.abstract) == 0:
        dict_['abstract_summary'].append("Not Provided.")
    elif len(content.abstract.split(' ')) > 100:
        # Long abstract: keep the first 100 words and mark the truncation.
        info = content.abstract.split(' ')[:100]
        summary = get_breaks(' '.join(info), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # Abstract is short enough to keep whole.
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)
    try:
        # Authors column: keep at most two names so plots stay readable.
        authors = meta_data['authors'].values[0].split(';')
        if len(authors) > 2:
            dict_['authors'].append(". ".join(authors[:2]) + "...")
        else:
            dict_['authors'].append(". ".join(authors))
    except Exception:
        # Null / non-string value (e.g. NaN has no .split): store as-is.
        dict_['authors'].append(meta_data['authors'].values[0])
    # Title and journal straight from the metadata row.  FIX: the original
    # wrapped the title in a try/except whose handler executed the exact
    # same statement as the try body, so the guard did nothing.
    dict_['title'].append(meta_data['title'].values[0])
    dict_['journal'].append(meta_data['journal'].values[0])
print("Completed Processing!")
# Materialise the dataframe and persist it.
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'references', 'title', 'journal', 'abstract_summary'])
df_covid.to_csv('Covid_19_Dataset_V1.csv')
# df_covid = pd.DataFrame.from_dict(dict_, orient='index')
# df_covid.transpose()
# Reload the saved dataframe for the following steps.
custom_df = pd.read_csv('Covid_19_Dataset_V1.csv', index_col=0)
custom_df.head(2)
# Rows whose title has at least 5 characters (shorter "titles" are noise
# and would create spurious reference matches).
mask = (custom_df['title'].str.len() >= 5)
temp_df = custom_df.loc[mask]
import re
# Feature engineering: normalise every title to lower case.
custom_df['title'] = custom_df['title'].apply(lambda x: lower_case(str(x)))
custom_df.head(3)
from collections import defaultdict
# my_reference_dict maps a paper_id to the paper_ids whose reference list
# contains that paper's title (exact, case-insensitive match).
my_reference_dict = defaultdict(list)
matchCount = 0
# FIX: guard the progress step -- len(custom_df) // 10 would be 0 for
# fewer than 10 rows and the modulo below would divide by zero.
progress_step = max(len(custom_df) // 10, 1)
for j in range(len(custom_df)):
    main_paper_id = custom_df.iloc[j, 0]
    title = custom_df.iloc[j, 5]
    if j % progress_step == 0:
        print(f'Processing index : {j} of {len(custom_df)}')
    # NOTE(review): indentation was lost in the source; matchCount is reset
    # per paper here -- confirm the intended reset granularity.
    matchCount = 0
    if len(title) >= 5:
        # Titles were already lower-cased above; lower once more
        # defensively, but hoisted out of the O(n) inner scan instead of
        # recomputing it per comparison.
        title_lc = title.lower()
        for i in range(len(custom_df)):
            myRefPaperId = custom_df.iloc[i, 0]
            myReferences = custom_df.iloc[i, 4]
            for line in myReferences.split('\n'):
                if title_lc == line.lower():
                    matchCount = matchCount + 1
                    my_reference_dict[main_paper_id].append(myRefPaperId)
    print(f'Match Count: {matchCount}')
print("Completed")
# Persist the reference dictionary in several formats.
# CSV: one row per (paper_id, list-of-citing-ids).  FIX: the original left
# the file handle open (no close); use a context manager.
with open("graphListV1.csv", "w") as csv_file:
    w = csv.writer(csv_file)
    for key, val in my_reference_dict.items():
        w.writerow([key, val])
# JSON.  BUG FIX: the original did `json = json.dumps(...)`, rebinding the
# name `json` to a *string* and shadowing the module, which made the
# `json.load(...)` call below fail.  Use a distinct variable name.
json_str = json.dumps(my_reference_dict)
with open("graphListV1.json", "w") as f:
    f.write(json_str)
# Plain-text repr of the dict.
with open("graphListV1.txt", "w") as f:
    f.write(str(my_reference_dict))
# # save the dict as pickle file
# f = open("graphListV1.pkl","wb")
# pickle.dump(my_reference_dict,f)
# f.close()
# Read the JSON back as a plain dictionary.
with open('graphListV1.json') as f:
    data = json.load(f)
# display the data
# print(data)
# Flatten the citation dict into parallel from/to edge lists and build the
# edge dataframe used for graph construction.
fromList = [src for src, targets in data.items() for _ in targets]
toList = [dst for targets in data.values() for dst in targets]
directed_df = pd.DataFrame({'from': fromList, 'to': toList})
# Number of distinct source nodes in the graph.
node_list = set(fromList)
len(node_list)
# Notebook-style inspection of the 'to' column without NaNs (not assigned).
directed_df['to'].dropna()
# Plot the directed citation graph, built from the first 20,000 edges only
# to keep layout and drawing tractable.
plt.figure(figsize=(40,40))
G = nx.from_pandas_edgelist(directed_df[:20000], 'from', 'to', create_using=nx.DiGraph())
# save in the graphml format
nx.write_graphml(G, "covid.graphml")
# save in the edge list format
nx.write_edgelist(G, "covid.edgelist")
# Smaller 2,000-edge subgraph so the spring layout finishes quickly.
G1 = nx.from_pandas_edgelist(directed_df[:2000], 'from', 'to', create_using=nx.DiGraph())
pos = nx.spring_layout(G1)
colors = range(2000)  # NOTE(review): unused below -- nx.draw gets node_color="red"
options = {
    "node_color": "red",
    "edge_color": 'black',
    "width": 10,
    "edge_cmap": plt.cm.Blues,
    "with_labels": False,
    "node_size":1500,
    "alpha":0.6,
    "arrows":True,
    "arrowsize":30,
    "arrowstyle":'->'
}
nx.draw(G1,pos,**options)
ax = plt.gca()
ax.set_axis_off()
plt.show()
# Summary statistics of the 20k-edge graph G.
nx.info(G)  # NOTE(review): nx.info was removed in networkx 3.0 -- confirm version
nx.is_weakly_connected(G)
print("Number of Edges:",nx.number_of_edges(G))
print("Number of Nodes:",nx.number_of_nodes(G))
print("Average Clustering Coeffient:",round(nx.average_clustering(G),3))
print("Density of the Graph:",round(nx.density(G),3))
# Degree histogram: log-log scatter of (degree, number of nodes).
import collections
degree_sequence = [d for n, d in G.degree()]  # degree sequence
degreeCount = collections.Counter(degree_sequence)
deg, cnt = zip(*degreeCount.items())
fig, ax = plt.subplots()  # NOTE(review): immediately superseded by the figure below
plt.figure(figsize=(15,10))
plt.loglog(deg, cnt,'o')
plt.title("Degree Histogram")
plt.ylabel("Number of Nodes")
plt.xlabel("Degree")
plt.show()
# degree= H.degree()
# Degree-rank plot: degrees sorted descending on log-log axes.
degree_sequence = sorted([d for n, d in G.degree()], reverse=True)
dmax = max(degree_sequence)  # NOTE(review): computed but not used afterwards
plt.loglog(degree_sequence, "b-", marker="o")
plt.title("Degree rank plot")
plt.ylabel("degree")
plt.xlabel("rank")
plt.show()
# Degree centrality, betweenness centrality and closeness centrality.
# (The original header mentioned "Eigenvector centrality", but the code
# below computes closeness centrality, not eigenvector.)
# A central actor is one that acts as a bridge, broker or gatekeeper.
# A central actor is connected to other central actors.
# A central node is one that is close, on average, to other nodes.
# print("Betweenness")
b = list(nx.betweenness_centrality(G).values())
# for v in G.nodes():
#     print(f"{v:2} {b[v]:.3f}")
# print("Degree centrality")
d = list(nx.degree_centrality(G).values())
# for v in G.nodes():
#     print(f"{v:2} {d[v]:.3f}")
# print("Closeness centrality")
c = list(nx.closeness_centrality(G).values())
# for v in G.nodes():
#     print(f"{v:2} {c[v]:.3f}")
# Overlaid histograms of the three centrality distributions.
plt.figure(figsize=(10,10))
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (use
# histplot) -- confirm the installed seaborn version.
sns.distplot(a = b ,label='Betweenness',color='black',hist=True,kde=False)
sns.distplot(a = d ,label='Degree centrality',color='red',hist=True,kde=False)
sns.distplot(a = c ,label='Closeness centrality',color='yellow',hist=True,kde=False)
plt.ylabel("Number of Nodes")
plt.xlabel("Value")
# Add title
plt.title("Histogram of Betweenness,Degree centrality, Closeness centrality")
# Force legend to appear
plt.legend()
def Average(lst):
    """Arithmetic mean of the numeric values in *lst*."""
    total = sum(lst)
    return total / len(lst)
# Mean of each centrality distribution.
avg_b1 = Average(b)
avg_b1
avg_d1 = Average(d)
avg_c1 = Average(c)
print("Average Betweenness:",round(avg_b1,5))
print("Average Degree centrality",round(avg_d1,5))
print("Average Closeness centrality",round(avg_c1,5))
# Per-node degree and clustering coefficient for the scatter plot below.
degrees = [d for n, d in G.degree()]
clusteringCoefficient = list(nx.clustering(G).values())
# NOTE(review): this rebinds `data`, which previously held the citation
# dict loaded from graphListV1.json.
data = {'Degree':degrees, 'Clustering Coefficient':clusteringCoefficient}
# Create DataFrame
graph_df = pd.DataFrame(data,columns=['Degree','Clustering Coefficient'])
graph_df.head(2)
plt.figure(figsize=(20,20))
plt.xlabel("Number of Nodes")
plt.ylabel("Degree")
plt.title("Clustering Coefficient Vs Degree")
# sns.scatter(data=df.column1, color="g")
# ax2 = plt.twinx()
# sns.lineplot(data=df.column2, color="b", ax=ax2)
ax = sns.scatterplot(data=graph_df)
print(f'Transitivity of the network: {round(nx.transitivity(G),3)}')
# print(f'Transitivity of the network: {round(nx.transitivity(G),3)}')
# Load the single connected giant component (previously exported in Pajek
# format).
giantG = nx.read_pajek("SingleComponent.net")
largest_cc = max(nx.weakly_connected_components(giantG), key=len)
# NOTE(review): largest_cc is a node set of giantG, but the subgraph is
# taken from G -- if node labels differ between the two graphs this may be
# empty.  Confirm this should not be giantG.subgraph(largest_cc).
H = G.subgraph(largest_cc)
plt.figure(figsize=(40,40))
pos = nx.spring_layout(H)
colors = range(2000)  # NOTE(review): unused below
options = {
    "node_color": "red",
    "edge_color": 'black',
    "width": 10,
    "edge_cmap": plt.cm.Blues,
    "with_labels": False,
    "node_size":1500,
    "alpha":0.6,
    "arrows":True,
    "arrowsize":30,
    "arrowstyle":'->'
}
nx.draw(H,pos,**options)
ax = plt.gca()
ax.set_axis_off()
plt.show()
nx.info(H)
print("Number of Edges:",nx.number_of_edges(H))
print("Number of Nodes:",nx.number_of_nodes(H))
# average_shortest_path_length raises for disconnected (or, for directed
# graphs, not strongly connected) inputs -- presumably H is connected
# enough here; verify against the data.
print("Average Path length:",round(nx.average_shortest_path_length(H),5))
print("Average Clustering Coeffient:",round(nx.average_clustering(H),5))
print("Density of the Graph:",round(nx.density(H),5))
def avg_degree_cal(degree):
    """Return the mean degree, rounded to 3 decimals.

    *degree* is an iterable of (node, degree) pairs, e.g. a networkx
    DegreeView.  FIX: the original iterated with ``enumerate``, so the loop
    variable named ``node`` was actually the position index and the pair
    was indexed with ``value[1]``; unpack the pairs directly instead.
    """
    pairs = list(degree)  # accept any iterable, and allow len() below
    total = sum(deg for _, deg in pairs)
    return round(total / len(pairs), 3)
# Erdos-Renyi random graph G(n, p) with the same node count as the network
# under study, drawn for visual comparison.
n = 1133
p = 0.36
rgraph = nx.gnp_random_graph(n,p)
plt.figure(figsize=(40,40))
pos = nx.spring_layout(rgraph)
colors = range(2000)  # NOTE(review): unused below
options = {
    "node_color": "red",
    "edge_color": 'black',
    "width": 10,
    "edge_cmap": plt.cm.Blues,
    "with_labels": False,
    "node_size":1500,
    "alpha":0.6,
    "arrows":True,
    "arrowsize":30,
    "arrowstyle":'->'
}
nx.draw(rgraph,pos,**options)
ax = plt.gca()
ax.set_axis_off()
plt.show()
import random
# Compare the real network against random G(n, p) graphs: generate
# `testNumber` graphs with random p and accumulate their metrics.
testNumber = 25
avg_path_length = 0
avg_clustering_coefficient = []
# NOTE(review): the next two accumulators are never updated -- the code
# that used them is commented out below.
avg_degree_distribution = 0
avg_density = 0
# connectivity with p = 0.2
n = 1133
# p = random.random()
# print(f'Probability {p}')
# rand_graph = nx.gnp_random_graph(n,p)
# avg_path_length = avg_path_length+ round(nx.average_shortest_path_length(rand_graph),3)
# avg_clustering_coefficient = avg_clustering_coefficient + round(nx.average_clustering(rand_graph),3)
# avg_degree_distribution = avg_degree_distribution + avg_degree_cal(rand_graph.degree())
# avg_density = avg_density + nx.density(rand_graph)
for i in range(testNumber):
    p = random.random()
    print(f'Probability {p}')
    rand_graph = nx.gnp_random_graph(n,p)
    # NOTE(review): average_shortest_path_length raises for a disconnected
    # graph; unlikely at n=1133 for most p, but possible for very small p.
    avg_path_length = avg_path_length + round(nx.average_shortest_path_length(rand_graph),3)
    avg_clustering_coefficient.append(round(nx.average_clustering(rand_graph),3))
    # avg_degree_distribution = avg_degree_distribution + avg_degree_disavg_degree_cal(rand_graph.degree())
    # avg_density = avg_density + nx.density(rand_graph)
avg_clustering_coefficient
print("Average Shortest Path:",round(avg_path_length/testNumber,3))
print("Mean Clustering Coefficient:",round(np.sum(avg_clustering_coefficient)/testNumber,3))
# Reload the dataframe that was saved earlier in the notebook.
custom_df =pd.read_csv('Covid_19_Dataset_V1.csv',index_col=0)
custom_df.head(2)
# Feature engineering: simple text-length statistics.  str(x) guards
# against NaN cells (floats have no .split).
custom_df['abstract_word_count'] = custom_df['abstract'].apply(lambda x: len(str(x).strip().split())) # word count in abstract
custom_df['body_word_count'] = custom_df['body_text'].apply(lambda x: len(str(x).strip().split())) # word count in body
custom_df['body_unique_words']=custom_df['body_text'].apply(lambda x:len(set(str(x).split()))) # number of unique words in body
custom_df.head()
custom_df.info()
# Drop exact duplicates on (abstract, body_text), then any rows with NaNs.
custom_df.drop_duplicates(['abstract', 'body_text'], inplace=True)
custom_df['abstract'].describe(include='all')
custom_df.describe()
custom_df.dropna(inplace=True)
custom_df.info()
# Make langdetect deterministic across runs.
DetectorFactory.seed = 0
# Detected language label for each paper, aligned with custom_df rows.
languages = []
for ii in tqdm(range(0, len(custom_df))):
    # Use roughly the first 50 words of the body text as detection sample.
    text = custom_df.iloc[ii]['body_text'].split(" ")
    lang = "en"
    try:
        if len(text) > 50:
            lang = detect(" ".join(text[:50]))
        elif len(text) > 0:
            lang = detect(" ".join(text))
    except Exception:
        # Beginning of the document was not in a usable format; retry with
        # the set of unique words.
        all_words = set(text)
        try:
            lang = detect(" ".join(all_words))
        except Exception:
            try:
                # Last resort: detect from the abstract summary.
                # BUG FIX: the original referenced an undefined name `df`
                # here, so this fallback always raised NameError and every
                # such paper was mislabelled "unknown".
                lang = detect(custom_df.iloc[ii]['abstract_summary'])
            except Exception:
                lang = "unknown"
    # Record the language for this paper.
    languages.append(lang)
from pprint import pprint
# Frequency of each detected language.  FIX: the original called
# languages.count(lang) once per distinct language (O(n*k)); Counter does
# one pass, and its keys follow first-occurrence order instead of the
# arbitrary order of iterating a set.
languages_dict = dict(collections.Counter(languages))
print("Total: {}\n".format(len(languages)))
pprint(languages_dict)
# Plot the language distribution and store the label on the dataframe.
custom_df['language'] = languages
plt.bar(range(len(languages_dict)), list(languages_dict.values()), align='center')
plt.xticks(range(len(languages_dict)), list(languages_dict.keys()))
plt.title("Distribution of Languages in Dataset")
plt.xlabel("Languages")
plt.ylabel("Number of Papers")
plt.show()
# Keep only the papers written in English.
custom_df = custom_df[custom_df['language'] == 'en']
custom_df.info()
# Download the scispacy large biomedical model, capturing pip's output so
# it does not flood the notebook.
from IPython.utils import io
with io.capture_output() as captured:
    !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz
# NLP imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_lg # model downloaded in previous step
import string
# Punctuation characters and the (extended) stopword list used by the
# tokenizer defined below.
punctuations = string.punctuation
stopwords = list(STOP_WORDS)
stopwords[:10]
# Corpus-specific boilerplate tokens that carry no topical signal.
custom_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
    'al.', 'Elsevier', 'PMC', 'CZI', 'www'
]
for w in custom_stop_words:
    if w not in stopwords:
        stopwords.append(w)
# Load the parser with tagger/NER disabled -- only tokenisation and lemmas
# are needed.  max_length raised to accept very long body texts.
parser = en_core_sci_lg.load(disable=["tagger", "ner"])
parser.max_length = 7000000
def spacy_tokenizer(sentence):
    """Tokenise *sentence* with the scispacy parser and return a cleaned,
    space-joined string of lower-cased lemmas, with stopwords and
    punctuation removed.
    """
    doc = parser(sentence)
    cleaned = []
    for token in doc:
        # spaCy uses the sentinel "-PRON-" for pronoun lemmas; fall back to
        # the lower-cased surface form in that case.
        lemma = token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        if lemma not in stopwords and lemma not in punctuations:
            cleaned.append(lemma)
    return " ".join(cleaned)
from sklearn.feature_extraction.text import TfidfVectorizer
def vectorize(text, maxx_features):
    """TF-IDF-vectorise *text* (an iterable of documents), keeping at most
    *maxx_features* terms; returns the sparse document-term matrix."""
    tfidf = TfidfVectorizer(max_features=maxx_features)
    return tfidf.fit_transform(text)
# Register tqdm with pandas so .progress_apply shows a progress bar.
tqdm.pandas()
# Work on the first 5,000 rows only -- tokenising the full corpus is slow.
test_df = custom_df[:5000]
# NOTE(review): test_df is a slice of custom_df; assigning a new column on
# it may trigger pandas' SettingWithCopyWarning -- consider .copy().
test_df["processed_text"] = test_df["body_text"].progress_apply(spacy_tokenizer)
sns.distplot(test_df['body_word_count'])
test_df['body_word_count'].describe()
sns.distplot(test_df['body_unique_words'])
test_df['body_unique_words'].describe()
test_df['processed_text']
# TF-IDF with a 4,096-term vocabulary over the processed body text.
text = test_df['processed_text'].values
X = vectorize(text, 2 ** 12)
X.shape
# Reduce dimensionality with PCA, keeping 95% of the variance.
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95, random_state=42)
X_reduced= pca.fit_transform(X.toarray())
X_reduced.shape
To find the best k value for k-means we'll look at the distortion at different k values. Distortion computes the sum of squared distances from each point to its assigned center. When distortion is plotted against k there will be a k value after which decreases in distortion are minimal. This is the desired number of clusters.
from sklearn import metrics
from scipy.spatial.distance import cdist
# Run k-means for a range of k and record the distortion (mean distance of
# each point to its nearest centroid) to locate the elbow.
distortions = []
K = range(2, 50)
for k in K:
    # FIX: the original called .fit(X_reduced) a second time on the
    # already-fitted model, doubling the runtime with no effect on the
    # result; fit exactly once.
    k_means = KMeans(n_clusters=k, random_state=42).fit(X_reduced)
    distortions.append(sum(np.min(cdist(X_reduced, k_means.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
    print('Found distortion for {} clusters'.format(k))
# Distortion values
distortions
# Straight line between the endpoints, drawn for visual elbow comparison.
X_line = [K[0], K[-1]]
Y_line = [distortions[0], distortions[-1]]
# Plot the elbow
plt.plot(K, distortions, 'b-')
plt.plot(X_line, Y_line, 'r')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()
In this plot we can see that the better k values are between 20-25. After that, the decrease in distortion is not as significant. For simplicity, we will use k=20
# Final clustering with the k chosen from the elbow plot above.
k = 20
kmeans = KMeans(n_clusters=k, random_state=42)
y_pred = kmeans.fit_predict(X_reduced)
# Store the cluster label on the working dataframe.
test_df['y'] = y_pred
test_df['y'].head(5)
Using the t-SNE we can reduce our high dimensional features vector to 2 dimensions. By using the 2 dimensions as x,y coordinates, the body_text can be plotted.
# Project the TF-IDF vectors to 2-D with t-SNE for visualisation.
from sklearn.manifold import TSNE
tsne = TSNE(verbose=1, perplexity=100, random_state=42)
X_embedded = tsne.fit_transform(X.toarray())
X_embedded
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
# sns settings
sns.set(rc={'figure.figsize':(15,15)})
# One colour per cluster (k=20).
palette = sns.hls_palette(20, l=.4, s=.9)
# Scatter of the 2-D embedding coloured by k-means label.
# NOTE(review): positional x/y arguments to sns.scatterplot are deprecated
# in seaborn >= 0.12 -- confirm the installed version.
sns.scatterplot(X_embedded[:,0], X_embedded[:,1], hue=y_pred, legend='full', palette=palette)
plt.title('t-SNE with Kmeans Labels')
plt.savefig("improved_cluster_tsne.png")
plt.show()
There are some clusters we can immediately detect, but the many instances closer to the center are harder to separate. So we have used the clusters found by k-means as labels, to visually separate the different concentrations of topics.